(https://labs.genetics.ucla.edu/horvath/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html)
We begin by loading in the necessary packages and data.
library(WGCNA)
## ==========================================================================
## *
## * Package WGCNA 1.51 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
library(dplyr)
library(tidyr)
library(DESeq2)
library(Rtsne)
library(ggplot2)
library(topGO)
library(sva)
library(biomaRt)
load("~/Documents/git/unified_gene_expression/data/counts_processed.Rdata")
load("~/Documents/git/unified_gene_expression/data/core_metaData.Rdata")
expr = t(counts)
meta = core_tight
dim(expr)
## [1] 841 20330
dim(meta)
## [1] 9317 9
We left_join the meta sheet (for the sample_accession variable) and the expression matrix. We then subset the resulting matrix into RPE, Retina, and RPE+Retina tables.
sample_accession = rownames(expr)
expr.temp = as.data.frame(cbind(sample_accession, expr))
rownames(expr.temp)[1:5]
## [1] "E-MTAB-4377.RNA10" "E-MTAB-4377.RNA11" "E-MTAB-4377.RNA12"
## [4] "E-MTAB-4377.RNA13" "E-MTAB-4377.RNA14"
colnames(expr.temp)[1:5]
## [1] "sample_accession" "A1BG" "A1CF"
## [4] "A2M" "A2ML1"
expr.meta = left_join(expr.temp, meta, by = "sample_accession") %>% dplyr::select(-run_accession) %>% distinct()
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
expr.temp$sample_accession = as.character(expr.temp$sample_accession)
expr.temp$sample_accession[1:5]
## [1] "E-MTAB-4377.RNA10" "E-MTAB-4377.RNA11" "E-MTAB-4377.RNA12"
## [4] "E-MTAB-4377.RNA13" "E-MTAB-4377.RNA14"
expr.meta$sample_accession[1:5]
## [1] "E-MTAB-4377.RNA10" "E-MTAB-4377.RNA11" "E-MTAB-4377.RNA12"
## [4] "E-MTAB-4377.RNA13" "E-MTAB-4377.RNA14"
identical(expr.temp$sample_accession, expr.meta$sample_accession)
## [1] TRUE
rownames(expr.meta) = expr.meta$sample_accession
expr.meta[1:5, 1:3]
## sample_accession A1BG A1CF
## E-MTAB-4377.RNA10 E-MTAB-4377.RNA10 9 6.00811
## E-MTAB-4377.RNA11 E-MTAB-4377.RNA11 25 14.0293
## E-MTAB-4377.RNA12 E-MTAB-4377.RNA12 22.99999 11.0124
## E-MTAB-4377.RNA13 E-MTAB-4377.RNA13 27 4.00845
## E-MTAB-4377.RNA14 E-MTAB-4377.RNA14 28 1.00068
class(expr.meta)
## [1] "data.frame"
table(expr.meta$Tissue)
##
## Adipose Tissue Adrenal Gland Blood Blood Vessel
## 20 10 20 30
## Brain Breast Colon Esophagus
## 130 10 20 30
## Heart Liver Lung Muscle
## 20 10 10 10
## Nerve Pancreas Pituitary Salivary Gland
## 10 10 10 10
## Skin Small Intestine Spleen Stomach
## 31 10 10 10
## Thyroid Cornea ENCODE Cell Line Retina
## 10 28 112 107
## RPE
## 76
expr.retina = filter(expr.meta, Tissue == "Retina")
expr.rpe = filter(expr.meta, Tissue == "RPE")
expr.retina.rpe = filter(expr.meta, Tissue == "Retina" | Tissue == "RPE")
rownames(expr.retina) = expr.retina$sample_accession
rownames(expr.rpe) = expr.rpe$sample_accession
rownames(expr.retina.rpe) = expr.retina.rpe$sample_accession
expr.retina = expr.retina[, c(2:ncol(expr.retina), 1)]
expr.rpe = expr.rpe[, c(2:ncol(expr.rpe), 1)]
expr.retina.rpe = expr.retina.rpe[, c(2:ncol(expr.retina.rpe), 1)]
expr.retina[1:5, 1:5]
## A1BG A1CF A2M A2ML1 A3GALT2
## E-MTAB-4377.RNA10 9 6.00811 2407.96 22.78949 5
## E-MTAB-4377.RNA11 25 14.0293 5608.974 44.42833 0
## E-MTAB-4377.RNA12 22.99999 11.0124 4419 15.05002 1
## E-MTAB-4377.RNA13 27 4.00845 914.999 11.280146 0
## E-MTAB-4377.RNA14 28 1.00068 1160 6.28242 1
expr.rpe[1:5, 1:5]
## A1BG A1CF A2M A2ML1 A3GALT2
## SRS1054265 70.00001 16.0123 3488.61 33.25127 3
## SRS1054266 514 7.0022000134004 53 24.35175 3
## SRS1054267 343.00027 15.0113000190766 53.98 53.17001 3
## SRS1054268 307.00013 13.0007 292.99961 14.85531 0
## SRS1054269 235.00018 9.00208 65 25.132097 3
expr.retina.rpe[1:5, 1:5]
## A1BG A1CF A2M A2ML1 A3GALT2
## E-MTAB-4377.RNA10 9 6.00811 2407.96 22.78949 5
## E-MTAB-4377.RNA11 25 14.0293 5608.974 44.42833 0
## E-MTAB-4377.RNA12 22.99999 11.0124 4419 15.05002 1
## E-MTAB-4377.RNA13 27 4.00845 914.999 11.280146 0
## E-MTAB-4377.RNA14 28 1.00068 1160 6.28242 1
dim(expr.retina)
## [1] 107 20338
dim(expr.rpe)
## [1] 76 20338
dim(expr.retina.rpe)
## [1] 183 20338
save(expr.retina, file = "~/Documents/git/unified_gene_expression/data/counts_retina.Rdata")
save(expr.rpe, file = "~/Documents/git/unified_gene_expression/data/counts_rpe.Rdata")
save(expr.retina.rpe, file = "~/Documents/git/unified_gene_expression/data/counts_retina_rpe.Rdata")
We first build the network with just the samples from the retina.
#####################################
## Retina Network ##
#####################################
meta.retina = expr.retina[, which(colnames(expr.retina) == "study_accession"):ncol(expr.retina)]
expr.retina = expr.retina[, 1:(which(colnames(expr.retina) == "study_accession") - 1)]
expr.retina[] = sapply(expr.retina, function(f) as.numeric(levels(f))[f])
save(expr.retina, file = "~/Documents/git/unified_gene_expression/data/expr_retina.Rdata")
save(meta.retina, file = "~/Documents/git/unified_gene_expression/data/meta_retina.Rdata")
We engage in some pre-processing of the data. We first filter out any genes for which at least 10% of samples have less than 10 counts. We then apply a variance stabilizing transformation and check to see that all the genes are satisfactory (in terms of variance and low expression).
min.expr.indices = which(colSums(expr.retina >= 10) >= 0.9*nrow(expr.retina))
expr.retina = expr.retina[, min.expr.indices]
expr.retina.var.stab = varianceStabilizingTransformation(round(as.matrix(expr.retina)))
save(expr.retina.var.stab, file = "~/Documents/git/unified_gene_expression/data/counts_retina_varStab.Rdata")
gsg = goodSamplesGenes(expr.retina.var.stab)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
We perform hierarchical cluster (using average linkage) on the samples to look for any outliers. We notice that the samples are clustering based on which study they came from.
sampleTree = hclust(dist(expr.retina.var.stab), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(as.numeric(factor(meta.retina$study_accession)), signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree, traitColors,
groupLabels = "study_accession",
main = "Sample dendrogram and study source (study_accesssion)")
We find in the dendrogram plot that there are batch effects with the different data sources. We further investigate these batch issues using t-SNE (t-Stochastic Neighbor Embedding).
set.seed(47)
perplexities = seq(10, 30, by = 10)
for(p in perplexities){
tsne_out = Rtsne(as.matrix(expr.retina.var.stab), perplexity = p, check_duplicates = F, theta = 0.0)
tsne_plot = data.frame(tsne_out$Y)
p1 = tsne_plot %>%
ggplot(aes(x=X1,y=X2,colour=factor(meta.retina$study_accession))) +
geom_point(size=4) + ggtitle(paste0("t-sne. Perplexity = ", p))
print(p1)
p2 = tsne_plot %>%
ggplot(aes(x=X1,y=X2,colour=factor(meta.retina$Sub_Tissue))) +
geom_point(size=4) + ggtitle(paste0("t-sne. Perplexity = ", p))
print(p2)
}
We then use Combat (from the sva package) to adjust for the batch issues (fetal vs. adult, study source).
expr.retina.combat = as.data.frame(t(expr.retina.var.stab))
# Remove the data from studies that have very few observations
small.study.indices = which(meta.retina$study_accession %in% c("SRP002881", "SRP015336"))
expr.retina.combat = expr.retina.combat[, -(small.study.indices)]
meta.retina.combat = meta.retina[-(small.study.indices), ]
meta.retina.combat.mergeVar = meta.retina.combat %>%
unite(study_subTissue, study_accession, Sub_Tissue)
modCombat = model.matrix(~1, data = meta.retina.combat.mergeVar)
combat_expr = t(ComBat(dat = expr.retina.combat, batch = factor(meta.retina.combat.mergeVar$study_subTissue),
mod = modCombat, par.prior=TRUE, prior.plots=FALSE))
## Found 4 batches
## Adjusting for 0 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
Following batch correction with Combat, we see that the batch issues are resolved, as clustering is minimal in t-SNE.
set.seed(47)
perplexities = seq(10, 30, by = 10)
for(p in perplexities){
tsne_out = Rtsne(as.matrix(combat_expr), perplexity = p, check_duplicates = F, theta = 0.0)
tsne_plot = data.frame(tsne_out$Y)
p1 = tsne_plot %>%
ggplot(aes(x=X1,y=X2,colour=factor(meta.retina.combat.mergeVar$study_subTissue))) +
geom_point(size=4) + ggtitle(paste0("t-sne (post-Combat adjustment). Perplexity = ", p))
print(p1)
p2 = tsne_plot %>%
ggplot(aes(x=X1,y=X2,colour=factor(meta.retina.combat.mergeVar$study_subTissue))) +
geom_point(size=4) + ggtitle(paste0("t-sne (post-Combat adjustment). Perplexity = ", p))
print(p2)
}
Following batch correction, we see that the distinct clusters are no longer present. Now we recluster the samples with hierarchical clustering (average linkage). The dendrogram shows that the batch issues have been resolved (at least partially).
sampleTree.combat = hclust(dist(combat_expr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(as.numeric(factor(meta.retina.combat.mergeVar$study_subTissue)), signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree.combat, traitColors,
groupLabels = "study_accession",
main = "Sample dendrogram and trait heatmap")
Now that we have resolved the batch effect issue, we can resume building a WGCNA network, exploring both signed and unsigned networks. We first begin with automatic network building.
options(stringsAsFactors = FALSE)
####################################
## Unsigned ##
####################################
# Choose a set of soft-thresholding powers
powers.unsigned = c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft.unsigned = pickSoftThreshold(combat_expr, powerVector = powers.unsigned, networkType = "unsigned")
## Warning: executing %dopar% sequentially: no parallel backend registered
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.7680 2.330 0.859 6020.00 6390.000 8440.0
## 2 2 0.2570 0.551 0.593 3280.00 3440.000 5820.0
## 3 3 0.0426 -0.197 0.479 1980.00 1990.000 4250.0
## 4 4 0.2940 -0.595 0.637 1270.00 1210.000 3230.0
## 5 5 0.4650 -0.842 0.741 860.00 757.000 2520.0
## 6 6 0.5520 -1.050 0.787 601.00 491.000 2010.0
## 7 7 0.6060 -1.200 0.818 433.00 328.000 1640.0
## 8 8 0.6570 -1.290 0.850 319.00 222.000 1350.0
## 9 9 0.6940 -1.370 0.874 240.00 154.000 1120.0
## 10 10 0.7250 -1.420 0.893 183.00 109.000 940.0
## 11 12 0.7760 -1.480 0.929 111.00 57.000 676.0
## 12 14 0.8070 -1.530 0.950 70.90 31.600 502.0
## 13 16 0.8180 -1.590 0.960 46.60 18.200 383.0
## 14 18 0.8240 -1.630 0.967 31.60 10.800 296.0
## 15 20 0.8330 -1.670 0.972 21.90 6.550 232.0
## 16 22 0.8460 -1.680 0.978 15.50 4.110 184.0
## 17 24 0.8570 -1.690 0.982 11.20 2.580 147.0
## 18 26 0.8600 -1.700 0.985 8.17 1.660 118.0
## 19 28 0.8670 -1.710 0.987 6.05 1.100 95.9
## 20 30 0.8770 -1.720 0.992 4.54 0.732 78.7
# With a power of 14 we get
# Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
# 14 0.8070 -1.530 0.950 70.90 31.600 502.0
####################################
## Signed ##
####################################
# Choose a set of soft-thresholding powers
powers.signed = c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft.signed = pickSoftThreshold(combat_expr, powerVector = powers.signed, networkType = "signed")
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.8860 9.380 0.938 9800.0 10000.0 11100
## 2 2 0.8060 4.030 0.897 7180.0 7440.0 9080
## 3 3 0.6770 2.130 0.831 5380.0 5610.0 7540
## 4 4 0.4170 1.050 0.692 4120.0 4300.0 6380
## 5 5 0.1660 0.521 0.592 3210.0 3320.0 5450
## 6 6 0.0136 0.133 0.524 2530.0 2600.0 4700
## 7 7 0.0190 -0.157 0.539 2030.0 2040.0 4090
## 8 8 0.1010 -0.360 0.601 1640.0 1620.0 3580
## 9 9 0.2010 -0.519 0.659 1340.0 1300.0 3150
## 10 10 0.3010 -0.652 0.710 1110.0 1040.0 2790
## 11 12 0.4470 -0.893 0.769 773.0 686.0 2230
## 12 14 0.5350 -1.090 0.806 554.0 461.0 1810
## 13 16 0.5960 -1.230 0.832 406.0 317.0 1490
## 14 18 0.6470 -1.320 0.860 304.0 221.0 1240
## 15 20 0.6860 -1.390 0.880 231.0 156.0 1040
## 16 22 0.7270 -1.430 0.905 178.0 112.0 877
## 17 24 0.7550 -1.470 0.921 139.0 81.5 746
## 18 26 0.7690 -1.500 0.929 110.0 60.1 638
## 19 28 0.7950 -1.510 0.944 87.8 44.9 549
## 20 30 0.8030 -1.550 0.950 70.7 34.0 478
# With a power of 30 we get
# Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
# 30 0.8030 -1.550 0.950 70.7 34.0 478
We achieve scale-free topology indices of 0.8 for powers of 14 and 30 for unsigned and signed networks, respectively. We build the networks using these power parameters. Upon examining the resulting gene dendrograms, it’s clear that the unsigned network is superior.
net.unsigned = blockwiseModules(combat_expr, power = 14, maxBlockSize = 14000,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.1,
numericLabels = TRUE, pamRespectsDendro = FALSE,
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## adjacency: replaceMissing: 0
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking modules for statistical meaningfulness..
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.1
## Calculating new MEs...
net.signed = blockwiseModules(combat_expr, power = 30, maxBlockSize = 14000,
TOMType = "signed", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.05,
numericLabels = TRUE, pamRespectsDendro = FALSE,
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## adjacency: replaceMissing: 0
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking modules for statistical meaningfulness..
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.05
## Calculating new MEs...
save(net.unsigned, file = "~/Documents/git/unified_gene_expression/data/retina_net_unsigned.Rdata")
save(net.signed, file = "~/Documents/git/unified_gene_expression/data/retina_net_signed.Rdata")
# Examining number of modules
table(net.unsigned$colors)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 643 6464 2036 1087 749 449 283 255 254 244 187 161 152 122 106
## 15 16 17 18 19 20 21 22 23
## 91 84 79 75 62 61 49 48 33
# Convert labels to colors for plotting
mergedColors = labels2colors(net.unsigned$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net.unsigned$dendrograms[[1]], mergedColors[net.unsigned$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# Examining number of modules
table(net.signed$colors)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 6892 3014 764 278 266 206 194 190 189 181 148 141 128 115 115
## 15 16 17 18 19 20 21 22 23 24 25 26 27
## 110 109 102 94 80 76 61 58 56 56 56 53 42
# Convert labels to colors for plotting
mergedColors = labels2colors(net.signed$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net.signed$dendrograms[[1]], mergedColors[net.signed$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
We now consider building a network using the TOM (Topological Overlap Matrix) and hierarchical clustering, as opposed to the automatic network construction performed above.
softPower = 14
adjacency = adjacency(combat_expr, power = softPower)
# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency)
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
## ..cutHeight not given, setting it to 0.999 ===> 99% of the (truncated) height range in dendro.
## ..done.
table(dynamicMods)
## dynamicMods
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 259 1587 858 802 557 511 446 405 331 331 284 265 221 219 215
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 214 210 206 195 185 184 176 171 169 166 158 158 156 151 151
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 151 150 149 148 146 139 137 134 130 129 126 118 117 116 105
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 99 97 97 97 96 95 92 89 88 87 82 78 76 76 76
## 60 61 62 63 64 65 66 67 68
## 67 66 65 62 58 53 50 49 43
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
## antiquewhite4 bisque4 black blue
## 66 97 405 858
## brown brown4 coral1 coral2
## 802 97 67 65
## cyan darkgreen darkgrey darkmagenta
## 215 171 166 146
## darkolivegreen darkorange darkorange2 darkred
## 148 158 99 176
## darkseagreen4 darkslateblue darkturquoise floralwhite
## 76 97 169 105
## green greenyellow grey grey60
## 511 265 259 206
## honeydew1 ivory lavenderblush3 lightcyan
## 76 116 76 210
## lightcyan1 lightgreen lightpink4 lightsteelblue1
## 117 195 78 118
## lightyellow magenta maroon mediumorchid
## 185 331 82 62
## mediumpurple3 midnightblue navajowhite2 orange
## 126 214 87 158
## orangered3 orangered4 paleturquoise palevioletred3
## 43 129 150 88
## pink plum plum1 plum2
## 331 49 130 96
## purple red royalblue saddlebrown
## 284 446 184 151
## salmon salmon4 sienna3 skyblue
## 219 89 139 151
## skyblue1 skyblue2 skyblue3 steelblue
## 50 58 134 151
## tan thistle1 thistle2 turquoise
## 221 92 95 1587
## violet white yellow yellow4
## 149 156 557 53
## yellowgreen
## 137
# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.0005,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
We then calculate the eigengenes for the different modules.
# Calculate eigengenes
MEList = moduleEigengenes(combat_expr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.05 # used to be 0.15
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(combat_expr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
## mergeCloseModules: Merging modules whose distance is less than 0.05
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 69 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 58 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 56 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 56 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Save module colors and labels for use in subsequent parts
save(MEs, moduleLabels, moduleColors, geneTree, file = "~/Documents/git/unified_gene_expression/data/networkConstruction-stepByStep.RData")
We now build a signed network.
powers = c(1:10, seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(combat_expr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 3248.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 3248 of 13774
## ..working on genes 3249 through 6496 of 13774
## ..working on genes 6497 through 9744 of 13774
## ..working on genes 9745 through 12992 of 13774
## ..working on genes 12993 through 13774 of 13774
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.7680 2.330 0.859 6020.0 6390.00 8440
## 2 2 0.2570 0.551 0.593 3280.0 3440.00 5820
## 3 3 0.0426 -0.197 0.479 1980.0 1990.00 4250
## 4 4 0.2940 -0.595 0.637 1270.0 1210.00 3230
## 5 5 0.4650 -0.842 0.741 860.0 757.00 2520
## 6 6 0.5520 -1.050 0.787 601.0 491.00 2010
## 7 7 0.6060 -1.200 0.818 433.0 328.00 1640
## 8 8 0.6570 -1.290 0.850 319.0 222.00 1350
## 9 9 0.6940 -1.370 0.874 240.0 154.00 1120
## 10 10 0.7250 -1.420 0.893 183.0 109.00 940
## 11 12 0.7760 -1.480 0.929 111.0 57.00 676
## 12 14 0.8070 -1.530 0.950 70.9 31.60 502
## 13 16 0.8180 -1.590 0.960 46.6 18.20 383
## 14 18 0.8240 -1.630 0.967 31.6 10.80 296
## 15 20 0.8330 -1.670 0.972 21.9 6.55 232
{# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.80,col="red")}
# Mean connectivity as a function of the soft-thresholding power
{plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,col="red")}
softPower = 14
adjacency = adjacency(combat_expr, power = softPower)
# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency)
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
## ..cutHeight not given, setting it to 0.999 ===> 99% of the (truncated) height range in dendro.
## ..done.
table(dynamicMods)
## dynamicMods
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 259 1587 858 802 557 511 446 405 331 331 284 265 221 219 215
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 214 210 206 195 185 184 176 171 169 166 158 158 156 151 151
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 151 150 149 148 146 139 137 134 130 129 126 118 117 116 105
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 99 97 97 97 96 95 92 89 88 87 82 78 76 76 76
## 60 61 62 63 64 65 66 67 68
## 67 66 65 62 58 53 50 49 43
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
## antiquewhite4 bisque4 black blue
## 66 97 405 858
## brown brown4 coral1 coral2
## 802 97 67 65
## cyan darkgreen darkgrey darkmagenta
## 215 171 166 146
## darkolivegreen darkorange darkorange2 darkred
## 148 158 99 176
## darkseagreen4 darkslateblue darkturquoise floralwhite
## 76 97 169 105
## green greenyellow grey grey60
## 511 265 259 206
## honeydew1 ivory lavenderblush3 lightcyan
## 76 116 76 210
## lightcyan1 lightgreen lightpink4 lightsteelblue1
## 117 195 78 118
## lightyellow magenta maroon mediumorchid
## 185 331 82 62
## mediumpurple3 midnightblue navajowhite2 orange
## 126 214 87 158
## orangered3 orangered4 paleturquoise palevioletred3
## 43 129 150 88
## pink plum plum1 plum2
## 331 49 130 96
## purple red royalblue saddlebrown
## 284 446 184 151
## salmon salmon4 sienna3 skyblue
## 219 89 139 151
## skyblue1 skyblue2 skyblue3 steelblue
## 50 58 134 151
## tan thistle1 thistle2 turquoise
## 221 92 95 1587
## violet white yellow yellow4
## 149 156 557 53
## yellowgreen
## 137
# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.0003,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
We then calculate the eigengenes.
# Calculate eigengenes
MEList = moduleEigengenes(combat_expr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
{plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.1 # used to be 0.15
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")}
# Call an automatic merging function
merge = mergeCloseModules(combat_expr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
## mergeCloseModules: Merging modules whose distance is less than 0.1
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 69 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 34 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 29 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 27 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 26 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 25 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 25 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
table(mergedColors)
## mergedColors
## bisque4 cyan darkmagenta darkorange2
## 97 215 5345 2075
## darkred darkseagreen4 darkslateblue darkturquoise
## 1801 76 476 265
## green grey lightcyan lightcyan1
## 795 259 210 268
## lightpink4 lightsteelblue1 maroon mediumorchid
## 78 118 361 62
## orangered4 salmon skyblue skyblue1
## 129 219 151 50
## steelblue thistle2 white yellow4
## 283 95 156 53
## yellowgreen
## 137
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
We obtain the TOM and visualize the network using a heat map. Plotting on all genes is very computationally intensive, so here we just compute the TOM on a randomly selected subset of 400 genes.
# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^7;
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA;
# Call the plot function
# plotting on
#TOMplot(plotTOM, geneTree, mergedColors, main = "Network heatmap plot, all genes")
nSelect = 400
# For reproducibility, we set the random seed
set.seed(47);
select = sample(ncol(combat_expr), size = nSelect);
selectTOM = dissTOM[select, select];
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = mergedColors[select];
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
library(GO.db)
library(AnnotationDbi)
library(org.Hs.eg.db)
# Load annotation file
load("~/Documents/git/unified_gene_expression/data/gencode_v25_annotation.Rdata")
# Read in the probe annotation
annot = gencode_v25_annotation %>% dplyr::select(-Transcript.ID) %>% distinct()
n_distinct(annot$Gene.ID)
## [1] 20378
n_distinct(annot$Gene.Name)
## [1] 20330
# Removing alternative splicing artifacts
annot[, 'Gene.ID'] = gsub("\\.[^.]*$", "", annot$Gene.ID)
annot = annot %>% distinct()
dim(annot)
## [1] 20360 2
# Match probes in the data set to the probe IDs in the annotation file
probes = colnames(combat_expr)
probes2annot = which(annot$Gene.Name %in% probes)
# Get the corresponding Gene IDs
allGeneIDs = annot[probes2annot, ]
# (For now - need new method) eliminate redundancy for gene names
allGeneIDs = allGeneIDs[!duplicated(allGeneIDs$Gene.Name),]
dim(allGeneIDs)
## [1] 13774 2
mart <- useMart("ensembl")
datasets <- listDatasets(mart)
mart<-useDataset("hsapiens_gene_ensembl",mart)
entrez_ids = getBM(attributes=c("ensembl_gene_id", "entrezgene"),filters="ensembl_gene_id",values=allGeneIDs, mart=mart)
test = merge(allGeneIDs,entrez_ids,by.x="Gene.ID",by.y="ensembl_gene_id") %>% distinct()
moduleColors = labels2colors(net.unsigned$colors)
# $ Choose interesting modules
intModules = c("brown", "green", "red")
for (module in intModules)
{
# Select module probes
modGenes = (moduleColors==module)
# Get their entrez ID codes
modGeneIDs = allGeneIDs[modGenes];
# Write them into a file
fileName = paste("~/Documents/git/unified_gene_expression/results/GeneIDs-", module, ".txt", sep="");
write.table(as.data.frame(modGeneIDs), file = fileName,
row.names = FALSE, col.names = FALSE)
}
# As background in the enrichment analysis, we will use all probes in the analysis.
fileName = paste("~/Documents/git/unified_gene_expression/results/GeneIDs-all.txt", sep="");
write.table(as.data.frame(allGeneIDs), file = fileName,
row.names = FALSE, col.names = FALSE)
Issue to resolve: converting from ensembl gene IDs to entrez gene IDs.
#GOenr = GOenrichmentAnalysis(moduleColors, allGeneIDs, organism = "human", nBestP = 10)